We describe the analyses of the multi-region colorectal adenocarcinoma Set7 (4 biopsies) and Set6 (6 biopsies), for which we have generated new WGS sequencing data (~100x median coverage). The data - purity, single nucleotide variants, copy number calls and clustering results - are available in Supplementary Table 2 (Excel).
The data for these tumours has been first sequenced, at lower coverage, in Cross W, et al. The evolutionary landscape of colorectal tumorigenesis. Nat Ecol Evol. 2018;2(10):1661–1672.
To implement this analysis we use the following packages:
Please refer to the webpage of each one of them for installation instructions.
We discuss the analysis for Set7, one of the two colorectal samples. The output RDS objects generated by this vignette are:
You need some auxiliary functions: auxiliary_functions.R
# Packages that are required specifically for the analysis
require(CNAqc)
require(mobster)
require(VIBER)
require(dplyr)
# Source a bunch of auxiliary functions
source('auxiliary_functions.R', verbose = FALSE)
We begin loading the data from the CSV files "Set7_mutations.csv" and "Set7_cna.csv", which are released along with the paper.
dir.create("./Set7/")
###########################v
# Load data from CSV files #
############################
# Sample names and purity (from previous analysis we know purity)
Set7_samples = paste0('Set7_', c(55, 57, 59, 62))
Set7_purity = pio:::nmfy(Set7_samples, c(.88, .88, .88, .8))
# We load SNV data for diploid regions that map to CNA segmentes with >500 mutations.
# These have been precomputed with the functions of the CNAqc package
Set7_mutations = readr::read_csv("./data/Set7_mutations.csv", col_types = readr::cols())
print(Set7_mutations)
#> # A tibble: 52,274 x 28
#> chr from to Set7_55.DP Set7_57.DP Set7_59.DP Set7_62.DP Set7_55.NV
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 chr1 8.36e7 8.36e7 73 66 84 82 35
#> 2 chr16 4.84e7 4.84e7 48 67 61 75 7
#> 3 chr16 4.88e7 4.88e7 66 82 63 73 0
#> 4 chr16 4.89e7 4.89e7 105 69 75 94 0
#> 5 chr16 4.92e7 4.92e7 50 61 54 68 0
#> 6 chr16 4.92e7 4.92e7 87 81 88 91 0
#> 7 chr16 4.92e7 4.92e7 94 91 90 91 0
#> 8 chr16 4.92e7 4.92e7 51 34 47 41 5
#> 9 chr16 4.92e7 4.92e7 88 100 88 100 0
#> 10 chr16 4.92e7 4.92e7 86 78 83 106 0
#> # … with 52,264 more rows, and 20 more variables: Set7_57.NV <dbl>,
#> # Set7_59.NV <dbl>, Set7_62.NV <dbl>, Set7_55_N.VAF <dbl>,
#> # Set7_57_N.VAF <dbl>, Set7_59_N.VAF <dbl>, Set7_62_N.VAF <dbl>,
#> # Set7_55.VAF <dbl>, Set7_57.VAF <dbl>, Set7_59.VAF <dbl>, Set7_62.VAF <dbl>,
#> # alt <chr>, cosmic <chr>, function. <chr>, gene <chr>, mutlocation <chr>,
#> # ref <chr>, region <chr>, vartype <chr>, patient <chr>
# Load CNA data for all segments (not just diploid)
Set7_CNA = readr::read_csv("./data/Set7_cna.csv", col_types = readr::cols())
print(Set7_CNA)
#> # A tibble: 60 x 20
#> chr from length to Set7_55.minor Set7_55.Major Set7_56.minor
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 chr1 1.58e6 1583 2.26e8 1 1 1
#> 2 chr1 2.27e8 3 2.27e8 1 1 1
#> 3 chr1 2.27e8 151 2.48e8 1 1 1
#> 4 chr2 2.80e5 1435 2.42e8 1 1 1
#> 5 chr3 3.86e5 763 1.41e8 1 1 1
#> 6 chr3 1.41e8 204 1.81e8 1 2 1
#> 7 chr3 1.83e8 161 1.97e8 1 1 1
#> 8 chr4 7.20e5 39 6.32e6 1 1 1
#> 9 chr4 7.70e6 794 1.88e8 1 1 1
#> 10 chr5 1.71e6 288 6.51e7 1 1 1
#> # … with 50 more rows, and 13 more variables: Set7_56.Major <dbl>,
#> # Set7_57.minor <dbl>, Set7_57.Major <dbl>, Set7_58.minor <dbl>,
#> # Set7_58.Major <dbl>, Set7_59.minor <dbl>, Set7_59.Major <dbl>,
#> # Set7_60.minor <dbl>, Set7_60.Major <dbl>, Set7_61.minor <dbl>,
#> # Set7_61.Major <dbl>, Set7_62.minor <dbl>, Set7_62.Major <dbl>
We use the CNAqc package to visualise and visualise the CNA segments and the VAF distribution.
plot_calls(
samples = Set7_samples,
CNA_calls = Set7_CNA,
mutation_calls = Set7_mutations,
purities = Set7_purity
)
#> [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 52274 mutations for 60 CNA segments (60 clonal, 0 subclonal)
#> ✓ Mapped n = 52274 mutations to clonal segments (100% of input)
#> [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 52274 mutations for 60 CNA segments (60 clonal, 0 subclonal)
#> ✓ Mapped n = 52274 mutations to clonal segments (100% of input)
#> [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 52274 mutations for 60 CNA segments (60 clonal, 0 subclonal)
#> ✓ Mapped n = 52274 mutations to clonal segments (100% of input)
#> [ CNAqc - CNA Quality Check ]
#> ℹ Using reference genome coordinates for: GRCh38.
#> ! Missing CCF column from CNA calls, adding CCF = 1 assuming clonal CNA calls.
#> ! Missing segments length from CNA calls, adding it to CNA calls.
#> ℹ Input n = 52274 mutations for 60 CNA segments (60 clonal, 0 subclonal)
#> ✓ Mapped n = 52274 mutations to clonal segments (100% of input)
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
#> Scale for 'x' is already present. Adding another scale for 'x', which will
#> replace the existing scale.
Fit each sample with MOBSTER using the raw VAF data, save the RDS output linked to this vignette, and plot the fits as (2x2 matrix).
mobster_fits = fit_mobsters(Set7_mutations, Set7_samples)
#> [ MOBSTER fit ]
#> ✔ Loaded input data, n = 22928.
#> ❯ n = 22928. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n >
#> 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> ℹ MOBSTER fits completed in 24.4s.
#> ── [ MOBSTER ] Set7_55 n = 22928 with k = 1 Beta(s) and a tail ──────────────────────────────────────────────────
#> ● Clusters: π = 51% [Tail] and 49% [C1], with π > 0.
#> ● Tail [n = 11360, 51%] with alpha = 1.7.
#> ● Beta C1 [n = 11568, 49%] with mean = 0.5.
#> ℹ Score(s): NLL = -20588.26; ICL = -39265.18 (-41116.28), H = 1851.1 (0). Fit converged by MM in 16 steps.
#> [ MOBSTER fit ]
#> ✔ Loaded input data, n = 21657.
#> ❯ n = 21657. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n >
#> 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> ℹ MOBSTER fits completed in 22.8s.
#> ── [ MOBSTER ] Set7_57 n = 21657 with k = 1 Beta(s) and a tail ──────────────────────────────────────────────────
#> ● Clusters: π = 65% [Tail] and 35% [C1], with π > 0.
#> ● Tail [n = 13463, 65%] with alpha = 1.4.
#> ● Beta C1 [n = 8194, 35%] with mean = 0.49.
#> ℹ Score(s): NLL = -19421.46; ICL = -36444.05 (-38783.03), H = 2338.97 (0). Fit converged by MM in 21 steps.
#> [ MOBSTER fit ]
#> ✔ Loaded input data, n = 25104.
#> ❯ n = 25104. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n >
#> 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> ℹ MOBSTER fits completed in 21.9s.
#> ── [ MOBSTER ] Set7_59 n = 25104 with k = 1 Beta(s) and a tail ──────────────────────────────────────────────────
#> ● Clusters: π = 67% [Tail] and 33% [C1], with π > 0.
#> ● Tail [n = 16463, 67%] with alpha = 1.8.
#> ● Beta C1 [n = 8641, 33%] with mean = 0.49.
#> ℹ Score(s): NLL = -28392.22; ICL = -54891.5 (-56723.65), H = 1832.15 (0). Fit converged by MM in 14 steps.
#> [ MOBSTER fit ]
#> ✔ Loaded input data, n = 22396.
#> ❯ n = 22396. Mixture with k = 1,2 Beta(s). Pareto tail: TRUE and FALSE. Output clusters with π > 0.02 and n >
#> 10.
#> ❯ Custom fit by Moments-matching in up to 150 steps, with ε = 1e-06 and peaks initialisation.
#> ❯ Scoring (without parallel) 1 x 2 x 2 = 4 models by ICL.
#> ℹ MOBSTER fits completed in 22.3s.
#> ── [ MOBSTER ] Set7_62 n = 22396 with k = 1 Beta(s) and a tail ──────────────────────────────────────────────────
#> ● Clusters: π = 71% [Tail] and 29% [C1], with π > 0.
#> ● Tail [n = 15396, 71%] with alpha = 1.4.
#> ● Beta C1 [n = 7000, 29%] with mean = 0.5.
#> ℹ Score(s): NLL = -20370.75; ICL = -38138.67 (-40681.4), H = 2542.73 (0). Fit converged by MM in 22 steps.
# Save RDS
saveRDS(mobster_fits, file = "./Set7/Set7_mobster_fits.rds")
# Plot a 2x2 figure
ggarrange(plotlist = lapply(mobster_fits, plot), ncol = 2, nrow = 2)
We extract non-tail mutations, i.e., mutations never assigned to a tail.
# From the MOBSTER fits
non_tail_mutations = get_nontail_mutations(mutations = Set7_mutations, mobster_fits)
print(non_tail_mutations)
#> # A tibble: 11,304 x 29
#> chr from to Set7_55.DP Set7_57.DP Set7_59.DP Set7_62.DP Set7_55.NV
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 chr1 8.36e7 8.36e7 73 66 84 82 35
#> 2 chr16 7.11e7 7.11e7 69 78 58 68 39
#> 3 chr1 8.38e7 8.38e7 75 71 94 87 32
#> 4 chr1 8.39e7 8.39e7 85 73 86 71 37
#> 5 chr17 4.88e7 4.88e7 73 54 57 98 39
#> 6 chr1 8.40e7 8.40e7 77 66 84 88 27
#> 7 chr17 7.16e7 7.16e7 53 51 68 71 21
#> 8 chr1 8.41e7 8.41e7 76 80 94 103 40
#> 9 chr1 6.81e6 6.81e6 76 82 94 100 27
#> 10 chr1 8.42e7 8.42e7 66 78 88 90 37
#> # … with 11,294 more rows, and 21 more variables: Set7_57.NV <dbl>,
#> # Set7_59.NV <dbl>, Set7_62.NV <dbl>, Set7_55_N.VAF <dbl>,
#> # Set7_57_N.VAF <dbl>, Set7_59_N.VAF <dbl>, Set7_62_N.VAF <dbl>,
#> # Set7_55.VAF <dbl>, Set7_57.VAF <dbl>, Set7_59.VAF <dbl>, Set7_62.VAF <dbl>,
#> # alt <chr>, cosmic <chr>, function. <chr>, gene <chr>, mutlocation <chr>,
#> # ref <chr>, region <chr>, vartype <chr>, patient <chr>, id <chr>
We create separate inputs for the mutation read depth (DP, coverage or total number of reads) and the number of reads with the alternative allele (NV, number of variant reads).
# Cluster non-tail mutations with VIBER
DPs = non_tail_mutations %>% select(ends_with('DP'))
NVs = non_tail_mutations %>% select(ends_with('NV'))
colnames(DPs) = colnames(NVs) = Set7_samples
# Coverage data
print(DPs)
#> # A tibble: 11,304 x 4
#> Set7_55 Set7_57 Set7_59 Set7_62
#> <dbl> <dbl> <dbl> <dbl>
#> 1 73 66 84 82
#> 2 69 78 58 68
#> 3 75 71 94 87
#> 4 85 73 86 71
#> 5 73 54 57 98
#> 6 77 66 84 88
#> 7 53 51 68 71
#> 8 76 80 94 103
#> 9 76 82 94 100
#> 10 66 78 88 90
#> # … with 11,294 more rows
We can now run the variational inference fitting methods implemented in VIBER, to capture mixtures of Binomial distributions from non-tail SNVs.
options(easypar.parallel = FALSE)
# VIBER fit, and save RDS
viber_fit = VIBER::variational_fit(x = NVs, y = DPs, samples = 4, epsilon_conv = 1e-10, K = 5)
#> [ VIBER - variational fit ]
saveRDS(viber_fit, file = "./Set7/Set7_mobster_viber_fit.rds")
# Plot a 3x2 figure -- raw fit (all clusters)
ggarrange(
plotlist = plot(
viber_fit,
colors = get_cluster_colors(palettes = 'FantasticFox1', viber_fit)
),
ncol = 3,
nrow = 2)
As we discuss in the paper, we use a heuristic (implemented in VIBER) to filter out some clusters, and re-perform the plots.
# Apply the heuristic, and save another RDS
heuristic_fit = VIBER::choose_clusters(viber_fit, dimensions_cutoff = 1)
saveRDS(heuristic_fit, file = "./Set7/Set7_mobster_viber_fit_heuristics.rds")
# Plot a 3x2 figure -- after the heuristic
ggarrange(
plotlist = plot(
heuristic_fit,
colors = get_cluster_colors(palettes = 'FantasticFox1', heuristic_fit)
),
ncol = 3,
nrow = 2)
The standard analysis without MOBSTER clusters read counts for all SNVs.
# All mutations with VIBER
DPs = Set7_mutations %>% select(ends_with('DP'))
NVs = Set7_mutations %>% select(ends_with('NV'))
colnames(DPs) = colnames(NVs) = Set7_samples
# VIBER fit, and save RDS (one run as it takes time, K = 10 is the default as we expect more clusters here)
st_viber_fit = VIBER::variational_fit(x = NVs, y = DPs, samples = 1, epsilon_conv = 1e-8, max_iter = 100)
#> [ VIBER - variational fit ]
saveRDS(st_viber_fit, file = "./Set7/Set7_standard_fit.rds")
# Plot a 3x2 figure -- before the heuristic
ggarrange(
plotlist = plot(
st_viber_fit,
colors = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_viber_fit)
),
ncol = 3,
nrow = 2)
# Apply the heuristic
st_heuristic_fit = VIBER::choose_clusters(st_viber_fit, dimensions_cutoff = 1)
saveRDS(st_heuristic_fit, file = "./Set7/Set7_standard_fit_heuristics.rds")
# Plot a 3x2 figure -- after the heuristic
ggarrange(
plotlist = plot(
st_heuristic_fit,
colors = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_heuristic_fit)
),
ncol = 3,
nrow = 2)
We use an auxiliary plot assembly function that places on the top and bottom diagonal two VIBER fits, and MOBSTER fits on the diagonal of the matrix.
Without versus with MOBSTER, without the heuristic.
squareplot(
mobster_fits = mobster_fits,
viber_fit_bottom = st_viber_fit,
viber_fit_top = viber_fit,
samples_list = Set7_samples,
colors_bottom = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_viber_fit),
colors_top = get_cluster_colors(palettes = 'FantasticFox1', viber_fit)
)
Without versus with MOBSTER, with the heuristic.
squareplot(
mobster_fits = mobster_fits,
viber_fit_bottom = st_heuristic_fit,
viber_fit_top = heuristic_fit,
samples_list = Set7_samples,
colors_bottom = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_heuristic_fit),
colors_top = get_cluster_colors(palettes = 'FantasticFox1', heuristic_fit)
)
The output RDS objects generated by this vignette are:
dir.create("./Set6/")
# Load sample names and purity (from previous analysis, we know purity)
Set6_samples = paste0('Set6_', c(42, 44, 45:48))
Set6_purity = pio:::nmfy(Set6_samples, c(0.66, 0.72, 0.80, 0.80, 0.80, 0.80))
# Load SNV and CNA data, as for Set7
Set6_mutations = read_csv('./data/Set6_mutations.csv')
Set6_CNA = read_csv('./data/Set6_cna.csv')
# MOBSTER fits
mobster_fits = fit_mobsters(Set6_mutations, Set6_samples)
#> [ MOBSTER fit ]
#>
#>
#>
#>
#>
#> [ MOBSTER fit ]
#>
#>
#>
#>
#>
#> [ MOBSTER fit ]
#>
#>
#>
#>
#>
#> [ MOBSTER fit ]
#>
#>
#>
#>
#>
#> [ MOBSTER fit ]
#>
#>
#>
#>
#>
#> [ MOBSTER fit ]
saveRDS(mobster_fits, file = "./Set6/Set6_mobster_fits.rds")
# Non tails
non_tail_mutations = get_nontail_mutations(mutations = Set6_mutations, mobster_fits)
# VIBER, plus heuristic
DPs = non_tail_mutations %>% select(ends_with('DP'))
NVs = non_tail_mutations %>% select(ends_with('NV'))
colnames(DPs) = colnames(NVs) = Set6_samples
viber_fit = VIBER::variational_fit(x = NVs, y = DPs, samples = 4, epsilon_conv = 1e-10, K = 5)
#> [ VIBER - variational fit ]
saveRDS(viber_fit, file = "./Set6/Set6_mobster_viber_fit.rds")
heuristic_fit = VIBER::choose_clusters(viber_fit, dimensions_cutoff = 1)
saveRDS(heuristic_fit, file = "./Set6/Set6_mobster_viber_fit_heuristics.rds")
# Analysis without MOBSTER, plus heuristic
DPs = Set6_mutations %>% select(ends_with('DP'))
NVs = Set6_mutations %>% select(ends_with('NV'))
colnames(DPs) = colnames(NVs) = Set6_samples
# VIBER fit
st_viber_fit = VIBER::variational_fit(x = NVs, y = DPs, samples = 1, epsilon_conv = 1e-8, max_iter = 100)
#> [ VIBER - variational fit ]
saveRDS(st_viber_fit, file = "./Set6/Set6_standard_fit.rds")
st_heuristic_fit = VIBER::choose_clusters(st_viber_fit, dimensions_cutoff = 1)
saveRDS(st_heuristic_fit, file = "./Set6/Set6_standard_fit_heuristics.rds")
Without versus with MOBSTER, without the heuristic.
squareplot(
mobster_fits = mobster_fits,
viber_fit_bottom = st_viber_fit,
viber_fit_top = viber_fit,
samples_list = Set6_samples,
colors_bottom = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_viber_fit),
colors_top = get_cluster_colors(palettes = 'FantasticFox1', viber_fit)
)
Without versus with MOBSTER, with the heuristic.
squareplot(
mobster_fits = mobster_fits,
viber_fit_bottom = st_heuristic_fit,
viber_fit_top = heuristic_fit,
samples_list = Set6_samples,
colors_bottom = get_cluster_colors(palettes = c('BottleRocket2', "Zissou1"), st_heuristic_fit),
colors_top = get_cluster_colors(palettes = 'FantasticFox1', heuristic_fit)
)